Import libraries and source project files

# Project helpers: MRL/bioage estimation, parametric survival draws,
# and simulated-data generation.
source("R/bioage_estimate_median.R")
source("R/gompertz_draw.R")
source("R/weibull_draw.R")
source("R/generate_data.R")
library(tidyverse)
# NOTE(review): MASS is attached after tidyverse, so MASS::select() masks
# dplyr::select(); use dplyr::select() explicitly if selecting columns.
library(MASS)
library(psbcGroup)

Contents of the “generate_data.R” file

Wrapper

# Wrapper: simulate true betas, covariates X, a population lifetable, and an
# observed dataset sampled from that lifetable.
#
# Arguments:
#   pop_n, obs_n   - population size for the lifetable / observations sampled.
#   p, g           - number of covariates and number of covariate groups.
#   ltname         - cache filename for the population lifetable.
#   betafrac       - beta denominator passed to generate_betas().
#   seed           - RNG seed; the sampling step uses seed + 1.
#   force_recalc   - if TRUE, rebuild the lifetable even when a cache exists.
#   X_plots        - show diagnostic plots while generating X.
#   a, b           - Gompertz baseline parameters (defaults exp(-9), 0.085).
#   X_rho, β_rho   - within-group correlations for X and for the betas.
#   non_zero_betas - share of groups given non-zero betas.
#   X_scale        - scaling applied to X.
#   active_hazard, betasep, bscale - beta-generation tuning knobs.
#
# Returns: list(df = simulated dataset, true_betas = generated beta table).
generate_data = function(pop_n, obs_n, p, g, ltname, betafrac, seed, force_recalc, X_plots, a = exp(-9), b = 0.085, X_rho, β_rho, non_zero_betas, X_scale, active_hazard, betasep, bscale) {

  print("Generating βs...")
  betas = generate_betas(p = p, g = g, rho = β_rho, rho_between = 0,
                         seed = seed, mu_u = betasep, mu_l = -betasep, beta_scale = bscale,
                         beta_denom = betafrac, plot = TRUE,  # TRUE, not T (T is reassignable)
                         non_zero_groups = non_zero_betas, active_hazard = active_hazard)

  print("Generating X...")
  X = generate_X(n = pop_n, p = p, g = g, rho = X_rho, seed = seed,
                 rho_between = 0,
                 X_plots = X_plots, scale = X_scale)
  print("Done!")

  print("Generating population lifetable...")
  lt = generate_population_lifetable(N_pop = pop_n, M = p, betas = betas$beta,
                                     X = X$X, filename = ltname,
                                     seed = seed, force_recalc = force_recalc, a = a, b = b)
  print("Done!")

  # Quick visual check of the population mean-residual-life curve.
  # (renamed from `plot` to avoid shadowing base::plot)
  mrl_plot = ggplot(lt, aes(x = t, y = mrl)) +
    geom_line() +
    labs(
      title = "Population Lifetable MRL",
      x = "Chronological age (years)",
      y = "Mean Residual Life"
    ) +
    theme_minimal()
  print(mrl_plot)

  print("Generating data from lifetable...")
  # seed + 1 so the sampling stream differs from the generation stream.
  df_sim = create_dataset(M = p, n_obs = obs_n, G = g,
                          gsize = (p/g), lt = lt, betas = betas$beta,
                          seednr = seed + 1, X_plots = FALSE, a = a, b = b, X_rho = X_rho,
                          X_scale = X_scale, followup = 20)

  return(list(df = df_sim, true_betas = betas))
}

Function performance

source("R/generate_data.R")

# Fit a frequentist Gompertz AFT model (eha::aftreg) on a 50/50 train/test
# split, then evaluate (a) coefficient recovery and (b) predicted mean
# residual life (MRL) on the held-out half.
#
# Arguments:
#   df_sim - list(df = simulated data, true_betas = true coefficient table),
#            as returned by generate_data().
#   p      - number of covariates (columns x1..xp of df_sim$df).
#
# Returns: list(prediction = test-set MRL RMSE, coefficients = beta RMSE),
#          or NA when the model fails to fit (e.g. singular Hessian).
aft_reg = function(df_sim, p) {
  true_betas = df_sim$true_betas
  df_sim = df_sim$df

  # Train/test split 50/50
  train_indices = sample(seq_len(nrow(df_sim)), nrow(df_sim) / 2)
  df_sim_train = df_sim[train_indices, ]
  df_sim_test = df_sim[-train_indices, ]

  # Left-truncated survival formula: Surv(entry, exit, event) ~ x1 + ... + xp
  predictor_vars = paste(c(glue("x{1:p}")), collapse = " + ")
  aft_formula = as.formula(paste("Surv(age_start, age_end, status) ~", predictor_vars))

  # Fit the AFT model; aftreg can fail for large p, in which case return NA
  # so the caller can still report partial results.
  fit_aft = tryCatch(
    eha::aftreg(
      formula = aft_formula,
      data = df_sim_train,
      dist = "gompertz",
      control = list(
        maxit = 100,
        trace = FALSE
      )
    ),
    error = function(e) {
      message("AFT model error: ", e$message)
      NA
    }
  )

  # If model fitting failed (tryCatch returned NA instead of a model), bail out.
  if (!inherits(fit_aft, "aftreg")) {
    return(NA)
  }

  # Recover (a, b) of the Gompertz baseline from aftreg's canonical
  # log(scale)/log(shape) parametrization.
  sigma_est <- exp(fit_aft$coefficients["log(scale)"])
  tau_est <- exp(fit_aft$coefficients["log(shape)"])
  a_est <- tau_est / sigma_est
  b_est <- 1 / sigma_est

  betas_est_aft <- fit_aft$coefficients[1:p]
  linpred_aft <- rowSums(sweep(df_sim_test[, 1:p], 2, betas_est_aft, "*")) # SCALE

  # Predicted MRL at each subject's entry age under AFT time acceleration:
  # integrate the baseline survival past the accelerated entry age, divide by
  # survival at entry, then map back to the subject's own time scale.
  n_test <- nrow(df_sim_test)
  df_sim_test$pred_mrl <- rep(NA_real_, n_test)  # preallocate; avoid growing in loop
  for (i in seq_len(n_test)) {
    entry_acc <- df_sim_test$age_start[i] * exp(linpred_aft[i])
    mrl_uncon <- integrate(gomp_baseline_surv, lower = entry_acc,
                           upper = Inf, a = a_est, b = b_est)$value
    s_cond <- gomp_baseline_surv(entry_acc, a = a_est, b = b_est)
    df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred_aft[i])
  }

  # Observed vs predicted MRL (identity line in red)
  plt = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
    geom_point() +
    geom_abline(color = 'red') +
    theme_minimal()
  print(plt)

  # Coefficient-recovery RMSE
  aftreg_coef_rmse = sqrt(mean((betas_est_aft - true_betas$beta)^2))

  # Prediction RMSE on held-out MRL
  aftreg_rmse = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))

  # Plot difference between true and estimated betas
  true_betas$betahat = betas_est_aft

  plt = true_betas %>%
    mutate(index = 1:p) %>%
    ggplot(aes(x = index)) +
    # Vertical lines between pred and actual
    geom_segment(aes(xend = index, y = beta, yend = betahat), color = "gray") +
    geom_point(aes(y = beta, color = "True")) +
    geom_point(aes(y = betahat, color = "Predicted")) +
    scale_color_manual(values = c("True" = "black", "Predicted" = "skyblue"),
                       name = "Type") +
    ylab("β") +
    xlab("Index") +
    labs(title = "True vs Predicted β-coefficients for AFT",
         subtitle = glue("RMSE = {aftreg_coef_rmse %>% round(4)}")) +
    theme_minimal()
  print(plt)

  return(list(prediction = aftreg_rmse, coefficients = aftreg_coef_rmse))
}

# Fit the Bayesian penalized AFT model with group lasso and left truncation
# (psbcGroup::aftGL_LT) on a 50/50 train/test split, then evaluate
# coefficient recovery and predicted mean residual life (MRL).
#
# Arguments:
#   df_sim - list(df = simulated data, true_betas = true coefficient table).
#   p      - number of covariates (columns x1..xp); must be divisible by g.
#   g      - number of covariate groups.
#
# Returns: list(prediction = test-set MRL RMSE, coefficients = beta RMSE).
psbc_reg = function(df_sim, p, g) {
  true_betas = df_sim$true_betas
  df_sim = df_sim$df

  # Train/test split 50/50
  train_indices = sample(seq_len(nrow(df_sim)), nrow(df_sim) / 2)
  df_sim_train = df_sim[train_indices, ]
  df_sim_test = df_sim[-train_indices, ]

  # Outcome matrices: (entry age, exit age, event indicator)
  Y_train = cbind(df_sim_train$age_start, df_sim_train$age_end, df_sim_train$status)
  Y_test = cbind(df_sim_test$age_start, df_sim_test$age_end, df_sim_test$status)  # currently unused

  # Covariate matrices
  X_train = as.matrix(df_sim_train[, paste0("x", 1:p)])
  X_test = as.matrix(df_sim_test[, paste0("x", 1:p)])

  XC = NULL # CONFOUNDERS (none)

  # Group indicator: consecutive blocks of p/g variables per group
  grpInx = rep(1:g, each = p / g)

  # Prior hyperparameters (see ?aftGL_LT)
  hyperParams = list(
    a.sigSq = 0.01, b.sigSq = 0.01,
    mu0 = 0, h0 = 0.01,
    v = 0.1
  )

  # Starting values, following the psbcGroup reference manual:
  # https://cran.r-project.org/web/packages/psbcGroup/psbcGroup.pdf
  w = log(Y_train[, 1])
  mu = 0.1
  beta = rep(0.1, p)
  sigSq = 0.5
  tauSq = rep(0.4, p)
  lambdaSq = 10
  betaC = rep(0.11, 0) # length 0: no confounders
  startValues = list(w = w, beta = beta, tauSq = tauSq, mu = mu, sigSq = sigSq,
                     lambdaSq = lambdaSq, betaC = betaC)

  # MCMC settings
  mcmcParams = list(
    run = list(
      numReps = 5000,
      thin = 5, # keep every 5th draw
      burninPerc = 0.2 # discard the first 20% of draws
    ),
    tuning = list(
      mu.prop.var = 0.1,
      sigSq.prop.var = 0.005,
      L.beC = 50,
      M.beC = 1,
      eps.beC = 0.001,
      L.be = 100,
      M.be = 1,
      eps.be = 0.001
    )
  )

  fit_bayes_aft = aftGL_LT(Y_train, X_train, XC, grpInx, hyperParams, startValues, mcmcParams)

  # Point estimate per coefficient: mode of the kernel density of its
  # MCMC draws (maximum of the estimated posterior density).
  get_mode = function(x) {
    xdist = density(x)
    xdist$x[which.max(xdist$y)]
  }
  betas = fit_bayes_aft$beta.p %>% apply(MARGIN = 2, FUN = get_mode)

  # Linear predictor for the test set (X_test is already a matrix, see above)
  linpred = X_test %*% betas

  # Posterior means of the log-normal baseline parameters
  mu_est = mean(fit_bayes_aft$mu.p)
  sigSq_est = mean(fit_bayes_aft$sigSq.p)

  # Predicted MRL at entry age under the log-normal AFT baseline:
  # integrate baseline survival past the accelerated entry age, divide by
  # survival at entry, then undo the acceleration.
  n_test = nrow(df_sim_test)
  df_sim_test$pred_mrl = rep(NA_real_, n_test) # preallocate; avoid growing in loop
  for (i in seq_len(n_test)) {
    entry_acc = df_sim_test$age_start[i] * exp(linpred[i])
    mrl_uncon = integrate(function(x) 1 - plnorm(x, meanlog = mu_est, sdlog = sqrt(sigSq_est)), # var -> sd via sqrt
                          lower = entry_acc,
                          upper = Inf)$value
    s_cond = 1 - plnorm(entry_acc, meanlog = mu_est, sdlog = sqrt(sigSq_est))
    df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred[i])
  }

  # Observed vs predicted MRL (identity line in red)
  plt = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
    geom_point() +
    geom_abline(color = 'red') +
    theme_minimal()
  print(plt)

  # Prediction RMSE on held-out MRL
  pred_RMSE = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))

  # Coefficient-recovery RMSE
  beta_RMSE = sqrt(mean((betas - true_betas$beta)^2))

  # Plot difference between true and estimated betas
  true_betas$betahat = betas

  plt = true_betas %>%
    mutate(index = 1:p) %>%
    ggplot(aes(x = index)) +
    # Vertical lines between pred and actual
    geom_segment(aes(xend = index, y = beta, yend = betahat), color = "gray") +
    geom_point(aes(y = beta, color = "True")) +
    geom_point(aes(y = betahat, color = "Predicted")) +
    scale_color_manual(values = c("True" = "black", "Predicted" = "skyblue"),
                       name = "Type") +
    ylab("β") +
    xlab("Index") +
    labs(title = "True vs Predicted β-coefficients for Bayesian AFT",
         subtitle = glue("RMSE = {beta_RMSE %>% round(4)}")) +
    theme_minimal()
  print(plt)

  return(list(prediction = pred_RMSE, coefficients = beta_RMSE))
}

# Run one full simulation experiment: generate data, then fit and score both
# the frequentist Gompertz AFT model and the Bayesian PSBC group-lasso model
# on the same data.
#
# Arguments:
#   pop_n, obs_n - population and observed sample sizes.
#   p, g         - number of covariates / covariate groups.
#   ltname       - cache filename for the population lifetable.
#   bfrac        - beta denominator (betafrac in generate_data).
#   seed         - RNG seed for data generation and model fitting.
#   nzb          - fraction of non-zero beta groups.
#   bsep         - beta separation (mu_u = bsep, mu_l = -bsep).
#   X_rho        - within-group correlation of X.
#
# Returns: list(aft = ..., psbc = ...), each an RMSE list (or NA on failure).
run_experiment = function(pop_n = 20000, obs_n = 1000, p = 20, g = 4, ltname, bfrac, seed, nzb, bsep, X_rho) {

  print("<<<<GENERATING SIMULATED DATA>>>>")
  # BUG FIX: `seed` was previously ignored here (hard-coded 123), so every
  # experiment generated data from the same RNG stream regardless of the
  # seed argument. Note: cached lifetables (force_recalc = FALSE) built
  # under the old behavior may need regeneration.
  df_sim = generate_data(pop_n = pop_n, obs_n = obs_n, p = p, g = g,
                         ltname = ltname, betafrac = bfrac, seed = seed,
                         force_recalc = FALSE, X_plots = TRUE, a = exp(-9), b = 0.085,
                         X_rho = X_rho, β_rho = 0.9, non_zero_betas = nzb, X_scale = 1,
                         # Scale the active hazard so it stays comparable as
                         # p, nzb, bfrac, and bsep vary across experiments.
                         active_hazard = (0.05 * sqrt(p)) / nzb / bfrac / bsep,
                         betasep = bsep, bscale = 0.25)

  # Re-seed before each model so both see the identical train/test split.
  set.seed(seed + 1)
  print("<<<<RUNNING AFT>>>>")
  rmse_aft = aft_reg(df_sim = df_sim, p = p)

  set.seed(seed + 1)
  print("<<<<RUNNING PSBC BAYESIAN>>>>")
  rmse_psbc = psbc_reg(df_sim = df_sim, p = p, g = g)

  return(list(aft = rmse_aft, psbc = rmse_psbc))
}

Experiments

# ---- Experiment 1: p = 20, g = 4, uncorrelated X ----
# Note (from the warning below): only 791 of the requested 1000 cases were
# valid (age_start < age_death).
# p = 20
rmses = run_experiment(p = 20, g = 4, ltname = "p20_may20", bfrac = 40, seed = 42, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.215071322236047
## Lower range of beta values pre-scaling: -0.825018359139732
## Upper range of beta values pre-scaling: 1.69028592387452

## [1] "Generating X..."
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
##   Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 791 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 791
## Range of linpred values: -0.1849455 0.2802615 
## [1] "<<<<RUNNING AFT>>>>"

## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:04:41 2025
## 
## iteration: 2000: Wed May 28 12:05:08 2025
## 
## iteration: 3000: Wed May 28 12:05:35 2025
## 
## iteration: 4000: Wed May 28 12:06:01 2025
## 
## iteration: 5000: Wed May 28 12:06:28 2025

# Results: AFT predicts MRL better (RMSE 4.49 vs 15.11 for PSBC);
# coefficient RMSEs are comparable (0.0143 vs 0.0159).
print(rmses)
## $aft
## $aft$prediction
## [1] 4.485843
## 
## $aft$coefficients
## [1] 0.01431187
## 
## 
## $psbc
## $psbc$prediction
## [1] 15.11173
## 
## $psbc$coefficients
## [1] 0.01585668
# ---- Experiment 2: p = 60, g = 4, uncorrelated X ----
rmses = run_experiment(p = 60, g = 4, ltname = "p60_may20", bfrac = 40, seed = 43, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.0701313093887515
## Lower range of beta values pre-scaling: -0.68035102718843
## Upper range of beta values pre-scaling: 1.18615518586129

## [1] "Generating X..."

## [1] "Done!"
## [1] "Generating population lifetable..."
## Range of death: 0.432065781729277
## Range of death: 127.549335244695

## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 809 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 809
## Range of linpred values: -0.2904009 0.2777132 
## [1] "<<<<RUNNING AFT>>>>"

## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:07:25 2025
## 
## iteration: 2000: Wed May 28 12:08:11 2025
## 
## iteration: 3000: Wed May 28 12:08:59 2025
## 
## iteration: 4000: Wed May 28 12:09:47 2025
## 
## iteration: 5000: Wed May 28 12:10:34 2025

# Results: the AFT/PSBC prediction gap narrows at p = 60 (10.44 vs 16.92);
# PSBC now has the slightly lower coefficient RMSE (0.0161 vs 0.0182).
print(rmses)
## $aft
## $aft$prediction
## [1] 10.44082
## 
## $aft$coefficients
## [1] 0.01816652
## 
## 
## $psbc
## $psbc$prediction
## [1] 16.92182
## 
## $psbc$coefficients
## [1] 0.01612463
# ---- Experiment 3 (p = 100) is currently disabled ----
# 
# rmses = run_experiment(p = 100, g = 4, ltname = "p100_may20", bfrac = 40, seed = 44, nzb = 0.5, bsep = 2, X_rho = 0)
# print(rmses)
# 
# ---- Experiment 4: p = 200, g = 4, uncorrelated X ----
rmses = run_experiment(p = 200, g = 4, ltname = "p200_may20", bfrac = 40, seed = 45, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.148329628219689
## Lower range of beta values pre-scaling: -1.13669894797518
## Upper range of beta values pre-scaling: 1.65937506505303

## [1] "Generating X..."

## [1] "Done!"
## [1] "Generating population lifetable..."
## Range of death: 0.588888986973458
## Range of death: 150

## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 767 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 767
## Range of linpred values: -0.8091164 1.047817 
## [1] "<<<<RUNNING AFT>>>>"
## AFT model error: Singular hessian; suspicious variable No. TRUE:
##  = 
## Try running with fixed shape
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"
## iteration: 1000: Wed May 28 12:14:07 2025
## 
## iteration: 2000: Wed May 28 12:16:22 2025
## 
## iteration: 3000: Wed May 28 12:18:37 2025
## 
## iteration: 4000: Wed May 28 12:20:51 2025
## 
## iteration: 5000: Wed May 28 12:23:04 2025

# Results: at p = 200 the frequentist AFT fails (singular Hessian -> NA),
# while PSBC still fits (prediction RMSE 24.86, coefficient RMSE 0.0187).
print(rmses)
## $aft
## [1] NA
## 
## $psbc
## $psbc$prediction
## [1] 24.8586
## 
## $psbc$coefficients
## [1] 0.01872657

Monte Carlo

# Parameters grid:
# p
# g
# 
# MC = replicate(10000, {
#   
# })